This ipython file is the project by Hongyi Tang and Weijian Li for course 12752. There are four ipython files in the project in total. Each file consist of one cluster analysis task. In this file, the cluster analysis is demonstrated to 5 building types.



In [1]:

    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

%matplotlib inline

Please download the dataset and change the file path.



In [2]:

    
# Read in CBECS data
f = open('C:/F16-12-752-master/projects/thongyi_weijian1/data/CBECS.csv')
data = pd.read_csv(f,sep=',', header='infer', parse_dates=[1])
#data = pd.DataFrame.from_csv('../../lectures/data/CBECS.csv') 
data = data.set_index('PUBID')
data.tail()









    Out[2]:






  
    
      
      REGION
      CENDIV
      PBA
      FREESTN
      SQFT
      SQFTC
      WLCNS
      RFCNS
      RFCOOL
      RFTILT
      ...
      FKCLBTU
      FKWTBTU
      FKCKBTU
      FKOTBTU
      DHHTBTU
      DHCLBTU
      DHWTBTU
      DHCKBTU
      DHOTBTU
      PUBCLIM
    
    
      PUBID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      6716
      3
      5
      14
      1.0
      108000
      7
      1
      6
      2
      1
      ...
      0.0
      0.0
      0.0
      0.0
      NaN
      NaN
      NaN
      NaN
      NaN
      2
    
    
      6717
      3
      7
      5
      1.0
      1700
      2
      5
      5
      2
      2
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      2
    
    
      6718
      2
      3
      26
      1.0
      2000
      2
      1
      4
      2
      2
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1
    
    
      6719
      1
      2
      12
      1.0
      19250
      4
      1
      4
      2
      1
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      2
    
    
      6720
      3
      5
      14
      1.0
      142000
      7
      1
      1
      1
      2
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      2
    
  

5 rows × 1118 columns

Office, inpatient health care, service, public assembly and education buildings are selected.



In [74]:

    
energydata=pd.DataFrame()

type_B=[2,16,26,13,14] # office, inpatient health care, service, public assembly and education
type_C=[1,3,4,5,6,7,8,9,10,11,12,15,17,18,19,20,21,22,23,24,25,91]

data_type=data
data_type=data_type[data_type.NGUSED!=2]

for i in type_C:
    data_type=data_type[data_type.PBA!=i]

energydata['Building Type']=data_type.PBA

index=['ELBTU','NGBTU','ELVNBTU','NGHTBTU']
for i in index:
    energydata[i]=data_type[i]/data_type.SQFT
energydata









    Out[74]:






  
    
      
      Building Type
      ELBTU
      NGBTU
      ELVNBTU
      NGHTBTU
    
    
      PUBID
      
      
      
      
      
    
  
  
    
      7
      16
      111.653990
      51.838735
      23.739538
      0.000000
    
    
      9
      2
      59.424400
      19.931467
      17.621520
      16.893707
    
    
      10
      2
      91.791085
      0.222767
      21.699827
      0.000000
    
    
      11
      2
      42.062146
      20.280000
      18.361463
      19.146244
    
    
      13
      26
      34.994000
      6.896857
      2.202000
      6.896857
    
    
      15
      14
      62.133773
      17.110851
      4.380262
      9.513459
    
    
      16
      14
      44.434808
      35.471904
      4.091010
      25.715029
    
    
      18
      2
      10.216667
      8.183000
      2.971167
      8.183000
    
    
      20
      2
      37.380282
      22.818521
      14.557141
      21.278775
    
    
      23
      13
      22.276267
      9.973267
      3.859567
      9.730733
    
    
      24
      16
      84.670228
      143.382582
      14.037165
      67.848323
    
    
      30
      14
      23.974024
      34.225133
      2.972892
      30.004072
    
    
      31
      16
      52.721559
      0.449975
      12.871851
      0.000000
    
    
      35
      14
      20.924827
      38.079383
      2.216235
      32.082160
    
    
      36
      2
      30.185778
      8.474478
      13.816567
      7.493933
    
    
      37
      13
      25.851400
      23.329000
      0.672800
      22.582200
    
    
      47
      16
      112.065507
      36.251796
      32.019704
      0.000000
    
    
      51
      2
      41.638982
      29.714506
      14.336681
      20.583518
    
    
      54
      16
      116.452243
      129.202617
      19.629003
      7.006450
    
    
      56
      14
      23.226000
      50.701273
      1.994788
      17.707616
    
    
      64
      13
      61.291410
      153.400525
      9.889967
      152.621377
    
    
      67
      14
      17.523857
      0.834667
      1.261571
      0.603048
    
    
      69
      2
      111.319939
      3.025333
      33.627515
      0.912000
    
    
      70
      13
      45.290962
      2.109231
      1.837308
      0.069231
    
    
      74
      16
      138.391714
      97.729357
      17.607966
      48.233034
    
    
      76
      2
      76.080885
      2.363548
      25.345914
      0.000000
    
    
      81
      14
      42.551833
      68.709167
      4.299833
      58.528542
    
    
      85
      26
      1.971039
      29.858182
      0.175065
      25.146883
    
    
      92
      14
      24.616567
      6.623077
      4.013740
      6.623077
    
    
      95
      13
      39.543274
      1.249421
      2.296225
      0.700909
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      6611
      2
      56.035500
      38.796500
      11.106500
      31.261500
    
    
      6617
      16
      119.570520
      93.806463
      18.372153
      67.871923
    
    
      6618
      16
      93.863391
      122.775413
      13.017591
      97.630818
    
    
      6619
      16
      95.567550
      0.193555
      15.845235
      0.000000
    
    
      6624
      13
      21.094286
      109.480000
      2.070000
      105.707619
    
    
      6629
      2
      186.984667
      20.875852
      36.775889
      19.644852
    
    
      6637
      16
      178.824627
      37.098507
      7.642415
      0.000000
    
    
      6643
      14
      55.829700
      414.871325
      2.651900
      316.689950
    
    
      6648
      14
      26.283905
      44.791524
      2.631619
      35.409929
    
    
      6651
      14
      35.292222
      18.194889
      2.715422
      18.194889
    
    
      6656
      14
      34.797421
      54.591825
      7.462667
      29.633421
    
    
      6659
      2
      23.011429
      42.522857
      2.783429
      42.522857
    
    
      6662
      14
      70.270060
      1.818155
      12.736560
      1.027964
    
    
      6675
      16
      52.161962
      143.888599
      9.273709
      43.307132
    
    
      6681
      14
      41.022364
      46.330873
      5.011568
      39.805356
    
    
      6683
      2
      47.802000
      52.958667
      9.151333
      52.958667
    
    
      6688
      14
      34.410140
      42.106047
      2.576000
      31.000186
    
    
      6692
      14
      61.797682
      40.177209
      4.475609
      40.177209
    
    
      6693
      14
      50.435936
      56.027374
      7.290974
      44.929596
    
    
      6694
      26
      17.754971
      30.111571
      3.015971
      30.111571
    
    
      6695
      16
      136.860003
      0.271830
      24.284254
      0.000000
    
    
      6696
      14
      41.743009
      46.501628
      7.560744
      31.320228
    
    
      6700
      16
      76.598659
      113.545840
      13.078797
      80.291207
    
    
      6709
      14
      18.228881
      99.498966
      3.815614
      89.770335
    
    
      6710
      2
      64.932231
      7.261769
      16.085308
      7.261769
    
    
      6712
      13
      10.228545
      12.710000
      0.397364
      12.526364
    
    
      6713
      14
      51.918501
      0.315094
      13.446402
      0.000000
    
    
      6716
      14
      42.192602
      27.452917
      4.503444
      22.808935
    
    
      6718
      26
      104.144500
      135.966500
      4.339500
      127.668000
    
    
      6720
      14
      23.122232
      23.920761
      2.676141
      17.144408
    
  

2259 rows × 5 columns



In [75]:

    
energydata = energydata.dropna(how='any')
energydata = energydata[~(energydata == 0).any(axis=1)]
PBA1=energydata['Building Type'].unique()
PBA1
count=[]
for i in PBA1:
    count.append([energydata[energydata['Building Type']==i].shape[0],i])
count









    Out[75]:





[[714, 2], [244, 26], [471, 14], [226, 13], [283, 16]]



In [91]:

    
type_C=['Office','Inpatient health care','Service','Public assembly','Education']
fig1 = plt.figure(figsize=(20,15))
times=1
data_seperate=[]
# energydata[energydata['Building Type']==type_B[1]]
for i in range(len(type_B)):
    x=energydata[energydata['Building Type']==type_B[i]]
    x=x.drop(x.columns[0],axis=1)
    data_seperate.append(x)
for i in range(len(type_B)):
    plt.subplot(len(type_B),2,times)
    data_seperate[i].boxplot()
    times=times+1
    plt.title(type_C[i])
    plt.ylim(0,200)









    



C:\Users\Daniel\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: FutureWarning: 
The default value for 'return_type' will change to 'axes' in a future release.
 To use the future behavior now, set return_type='axes'.
 To keep the previous behavior and silence this warning, set return_type='dict'.



In [80]:

    
y=pd.DataFrame()
for i in range(len(type_B)):
    y=y.append(data_seperate[i])
X=y.as_matrix().astype(np.float32)
from sklearn.cluster import KMeans
num_clust = 5
clusters = KMeans(n_clusters=num_clust).fit(X)
cluster_assignments = clusters.predict(X)
# plt.subplot(num_clust+1,1,1)
# plt.plot(cluster_assignments[:150])
# plt.ylim([0.2,1.1])
fig2 = plt.figure(figsize=(20,15))
for cluster_id in range(len(clusters.cluster_centers_)):
    plt.subplot(num_clust,2,cluster_id+1)
    cluster_members = X[cluster_assignments==cluster_id,:]
    print(len(cluster_members))
    for i in range(len(cluster_members)):
        plt.plot(cluster_members[i,:], color='grey', lw='0.1')
    plt.plot(clusters.cluster_centers_[cluster_id,:], color='k', lw='1')



In [81]:

    
# plot stem figure to see the partten and outlier
y['assignment']=cluster_assignments
y=y[y.assignment!=4]
del y['assignment']
X=y.as_matrix().astype(np.float32)



In [82]:

    
num_clust = 5
clusters = KMeans(n_clusters=num_clust).fit(X)
cluster_assignments = clusters.predict(X)
# plt.subplot(num_clust+1,1,1)
# plt.plot(cluster_assignments[:150])
# plt.ylim([0.2,1.1])
fig2 = plt.figure(figsize=(20,15))
for cluster_id in range(len(clusters.cluster_centers_)):
    plt.subplot(num_clust+1,2,cluster_id+1)
    cluster_members = X[cluster_assignments==cluster_id,:]
    print(len(cluster_members))
    for i in range(len(cluster_members)):
        plt.plot(cluster_members[i,:], color='grey', lw='0.1')
    plt.plot(clusters.cluster_centers_[cluster_id,:], color='k', lw='1')



In [83]:

    
# plot stem figure to see the partten and outlier
y['assignment']=cluster_assignments
y=y[y.assignment!=4]
del y['assignment']
X=y.as_matrix().astype(np.float32)



In [84]:

    
num_clust = 5
clusters = KMeans(n_clusters=num_clust).fit(X)
cluster_assignments = clusters.predict(X)
# plt.subplot(num_clust+1,1,1)
# plt.plot(cluster_assignments[:150])
# plt.ylim([0.2,1.1])
fig2 = plt.figure(figsize=(20,15))
for cluster_id in range(len(clusters.cluster_centers_)):
    plt.subplot(num_clust+1,2,cluster_id+1)
    cluster_members = X[cluster_assignments==cluster_id,:]
    print(len(cluster_members))
    for i in range(len(cluster_members)):
        plt.plot(cluster_members[i,:], color='grey', lw='0.1')
    plt.plot(clusters.cluster_centers_[cluster_id,:], color='k', lw='1')



In [85]:

    
# plot stem figure to see the partten and outlier
y['assignment']=cluster_assignments
y=y[y.assignment!=2]
del y['assignment']
X=y.as_matrix().astype(np.float32)



In [86]:

    
num_clust = 5
clusters = KMeans(n_clusters=num_clust).fit(X)
cluster_assignments = clusters.predict(X)
# plt.subplot(num_clust+1,1,1)
# plt.plot(cluster_assignments[:150])
# plt.ylim([0.2,1.1])
fig2 = plt.figure(figsize=(20,15))
for cluster_id in range(len(clusters.cluster_centers_)):
    plt.subplot(num_clust+1,2,cluster_id+1)
    cluster_members = X[cluster_assignments==cluster_id,:]
    print(len(cluster_members))
    for i in range(len(cluster_members)):
        plt.plot(cluster_members[i,:], color='grey', lw='0.1')
    plt.plot(clusters.cluster_centers_[cluster_id,:], color='k', lw='1')

The connection between assignment and building type need to be found by visually judging. The connecting can be different every time the file was run.



In [87]:

    
y['assignment']=cluster_assignments
y=y.join(data['PBA'],how='inner')
y
y['judge']=1
y['judge'].iloc[np.where(np.array(y.PBA)==2)]=0
y['judge'].iloc[np.where(np.array(y.PBA)==16)]=3
y['judge'].iloc[np.where(np.array(y.PBA)==26)]=1
y['judge'].iloc[np.where(np.array(y.PBA)==13)]=4
y['judge'].iloc[np.where(np.array(y.PBA)==14)]=2
y[y['judge']==y['assignment']].count()









    



C:\Users\Daniel\Anaconda3\lib\site-packages\pandas\core\indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)






    Out[87]:





ELBTU         727
NGBTU         727
ELVNBTU       727
NGHTBTU       727
assignment    727
PBA           727
judge         727
dtype: int64



In [88]:

    
a=727/(714+283+244+226+471)



In [89]:

    
a









    Out[89]:





0.3751289989680083



In [ ]:

	REGION	CENDIV	PBA	FREESTN	SQFT	SQFTC	WLCNS	RFCNS	RFCOOL	RFTILT	...	FKCLBTU	FKWTBTU	FKCKBTU	FKOTBTU	DHHTBTU	DHCLBTU	DHWTBTU	DHCKBTU	DHOTBTU	PUBCLIM
PUBID
6716	3	5	14	1.0	108000	7	1	6	2	1	...	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	2
6717	3	7	5	1.0	1700	2	5	5	2	2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2
6718	2	3	26	1.0	2000	2	1	4	2	2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1
6719	1	2	12	1.0	19250	4	1	4	2	1	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2
6720	3	5	14	1.0	142000	7	1	1	1	2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2

	Building Type	ELBTU	NGBTU	ELVNBTU	NGHTBTU
PUBID
7	16	111.653990	51.838735	23.739538	0.000000
9	2	59.424400	19.931467	17.621520	16.893707
10	2	91.791085	0.222767	21.699827	0.000000
11	2	42.062146	20.280000	18.361463	19.146244
13	26	34.994000	6.896857	2.202000	6.896857
15	14	62.133773	17.110851	4.380262	9.513459
16	14	44.434808	35.471904	4.091010	25.715029
18	2	10.216667	8.183000	2.971167	8.183000
20	2	37.380282	22.818521	14.557141	21.278775
23	13	22.276267	9.973267	3.859567	9.730733
24	16	84.670228	143.382582	14.037165	67.848323
30	14	23.974024	34.225133	2.972892	30.004072
31	16	52.721559	0.449975	12.871851	0.000000
35	14	20.924827	38.079383	2.216235	32.082160
36	2	30.185778	8.474478	13.816567	7.493933
37	13	25.851400	23.329000	0.672800	22.582200
47	16	112.065507	36.251796	32.019704	0.000000
51	2	41.638982	29.714506	14.336681	20.583518
54	16	116.452243	129.202617	19.629003	7.006450
56	14	23.226000	50.701273	1.994788	17.707616
64	13	61.291410	153.400525	9.889967	152.621377
67	14	17.523857	0.834667	1.261571	0.603048
69	2	111.319939	3.025333	33.627515	0.912000
70	13	45.290962	2.109231	1.837308	0.069231
74	16	138.391714	97.729357	17.607966	48.233034
76	2	76.080885	2.363548	25.345914	0.000000
81	14	42.551833	68.709167	4.299833	58.528542
85	26	1.971039	29.858182	0.175065	25.146883
92	14	24.616567	6.623077	4.013740	6.623077
95	13	39.543274	1.249421	2.296225	0.700909
...	...	...	...	...	...
6611	2	56.035500	38.796500	11.106500	31.261500
6617	16	119.570520	93.806463	18.372153	67.871923
6618	16	93.863391	122.775413	13.017591	97.630818
6619	16	95.567550	0.193555	15.845235	0.000000
6624	13	21.094286	109.480000	2.070000	105.707619
6629	2	186.984667	20.875852	36.775889	19.644852
6637	16	178.824627	37.098507	7.642415	0.000000
6643	14	55.829700	414.871325	2.651900	316.689950
6648	14	26.283905	44.791524	2.631619	35.409929
6651	14	35.292222	18.194889	2.715422	18.194889
6656	14	34.797421	54.591825	7.462667	29.633421
6659	2	23.011429	42.522857	2.783429	42.522857
6662	14	70.270060	1.818155	12.736560	1.027964
6675	16	52.161962	143.888599	9.273709	43.307132
6681	14	41.022364	46.330873	5.011568	39.805356
6683	2	47.802000	52.958667	9.151333	52.958667
6688	14	34.410140	42.106047	2.576000	31.000186
6692	14	61.797682	40.177209	4.475609	40.177209
6693	14	50.435936	56.027374	7.290974	44.929596
6694	26	17.754971	30.111571	3.015971	30.111571
6695	16	136.860003	0.271830	24.284254	0.000000
6696	14	41.743009	46.501628	7.560744	31.320228
6700	16	76.598659	113.545840	13.078797	80.291207
6709	14	18.228881	99.498966	3.815614	89.770335
6710	2	64.932231	7.261769	16.085308	7.261769
6712	13	10.228545	12.710000	0.397364	12.526364
6713	14	51.918501	0.315094	13.446402	0.000000
6716	14	42.192602	27.452917	4.503444	22.808935
6718	26	104.144500	135.966500	4.339500	127.668000
6720	14	23.122232	23.920761	2.676141	17.144408